full_data <- readRDS('data/full_data.rds')
daily_full <- readRDS('data/daily_full.rds')

Explore some summary statistics

Fixed Monitor stats

base_monitor_stat <- daily_full %>%
  group_by(Monitor) %>%
  summarise('Closest refinery' = unique(Refinery),
            'Distance to nearest refinery' = unique(MinDist),
            'Angle to refinery' = unique(Converted_Angle),
            'Distance to nearest wrp' = unique(dist_wrp),
            'Capacity of nearest wrp' = unique(capacity),
            'Angle to wrp' = unique(wrp_angle),
            'Distance to Dominguez Channel' = unique(dist_dc),
            'Elevation' = unique(elevation),
            'Enhanced Vegetation Index' = unique(EVI))

knitr::kable(base_monitor_stat, digits = 2, format = 'html')
Monitor Closest refinery Distance to nearest refinery Angle to refinery Distance to nearest wrp Capacity of nearest wrp Angle to wrp Distance to Dominguez Channel Elevation Enhanced Vegetation Index
213th & Chico Marathon (Carson) 2873.05 306 4297.10 400 213.09 50.00 7 0.12
Elm Ave Torrance Refinery 1716.47 88 8414.89 400 142.20 2020.45 27 0.25
First Methodist Phillips 66 (Wilmington) 2119.19 209 2456.12 400 355.45 3791.56 14 0.21
G Street Phillips 66 (Wilmington) 758.68 225 2940.01 400 356.20 3748.28 8 0.09
Guenser Park Torrance Refinery 2541.89 231 7701.97 400 159.18 374.61 16 0.14
Harbor Park Phillips 66 (Wilmington) 1478.91 265 2012.17 400 5.56 4262.03 12 0.60
Hudson Marathon (Wilmington) 1308.28 198 5919.88 400 271.99 705.23 8 0.14
Inner Port Valero Refinery 1988.18 192 5969.55 15 227.90 1936.68 5 0.04
Judson Marathon (Carson) 2792.00 338 2692.42 400 212.89 1481.34 13 0.14
Manhattan Chevron El Segundo 2275.17 109 5462.08 850 325.46 6144.64 42 0.19
North HS Torrance Refinery 1371.27 296 5740.63 400 130.29 4199.05 32 0.13
St. Luke Marathon (Carson) 2713.63 189 6910.05 400 255.89 1789.55 10 0.17
West HS Torrance Refinery 3592.78 11 9020.27 400 122.68 4869.95 31 0.17

Since Feb 2022

sincefeb2022_stat <- daily_full %>%
  filter(day >= '2022-02-01') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
            'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`))

knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within county Active wells 1km
Elm Ave 70 5.10 3.10 2.79 1.33 4.73 245.23 0.00 44.67
First Methodist 282 14.72 2.28 2.56 0.81 3.18 271.33 0.06 43.39
G Street 282 39.18 2.43 3.89 0.93 4.20 321.32 0.06 43.39
Guenser Park 278 6.86 2.66 2.04 0.94 4.27 260.90 0.01 43.41
Harbor Park 280 9.65 2.48 1.62 0.41 2.88 287.43 0.06 43.41
Hudson 283 173.00 9.46 3.62 1.10 3.33 44.38 0.12 43.40
Inner Port 282 52.63 5.36 6.48 0.91 4.19 215.54 0.05 43.40
Judson 283 9.52 2.39 1.80 0.51 3.63 275.45 0.40 43.40
Manhattan 283 5.39 1.10 0.87 0.36 3.28 226.77 0.05 43.40
North HS 70 4.00 1.60 1.49 0.50 4.70 243.79 0.04 44.67
St. Luke 282 11.62 2.71 2.34 0.72 3.50 323.55 0.12 43.40
West HS 69 2.90 0.88 1.15 0.41 4.70 242.49 0.04 44.70

During Disaster (October 2021 - December 2021)

disaster_stat <- daily_full %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
            'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`))

knitr::kable(disaster_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within county Active wells 1km
213th & Chico 47 3817.24 431.49 450.16 51.67 3.37 289.33 11.98 46.21
Elm Ave 92 63.50 8.15 7.73 1.34 3.74 234.91 0.46 46.34
First Methodist 56 149.47 14.11 13.80 2.07 2.67 296.88 1.86 46.34
G Street 56 48.67 6.61 10.66 1.95 3.66 331.85 1.86 46.34
Guenser Park 56 211.67 14.49 17.98 1.91 3.47 273.88 0.41 46.34
Harbor Park 56 75.93 9.37 8.49 1.20 2.36 304.86 0.55 46.34
Hudson 56 192.64 19.45 18.37 3.61 2.51 18.24 0.62 46.34
Inner Port 56 136.50 12.21 18.26 3.54 3.45 17.38 0.09 46.34
Judson 56 742.25 69.08 59.11 7.57 2.81 286.73 22.30 46.34
Manhattan 56 17.01 3.24 3.41 1.44 2.74 199.17 0.11 46.34
North HS 92 98.50 7.88 6.94 1.14 3.74 235.24 0.53 46.34
St. Luke 56 119.72 12.26 10.67 2.52 2.99 0.44 0.62 46.34
West HS 92 41.50 4.89 3.50 0.76 3.74 234.89 0.53 46.34

Normal Period (Jan 2020- May 2023) excluding disaster

normal_stat <- daily_full %>%
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
            'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`))

knitr::kable(normal_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within county Active wells 1km
213th & Chico 16 14.27 3.92 4.82 2.99 2.50 304.22 2.69 45.00
Elm Ave 731 11.40 3.20 2.19 0.97 4.53 241.58 0.02 48.59
First Methodist 462 17.58 2.81 2.65 0.83 3.24 274.30 0.13 45.19
G Street 460 382.76 18.66 4.57 0.91 4.27 323.56 0.10 45.19
Guenser Park 457 6.86 2.66 1.73 0.71 4.47 262.07 0.01 45.21
Harbor Park 460 10.34 2.67 1.71 0.45 2.91 292.68 0.06 45.21
Hudson 460 173.00 9.46 3.40 1.15 3.36 25.30 0.10 45.19
Inner Port 462 52.63 5.36 5.80 0.83 4.36 224.57 0.05 45.20
Judson 457 15.78 3.55 1.98 0.64 3.70 277.60 0.41 45.17
Manhattan 387 5.39 1.58 1.05 0.47 3.12 231.20 0.10 44.27
North HS 740 30.60 2.71 1.94 1.03 4.53 241.27 0.04 48.60
St. Luke 462 11.62 2.91 2.25 0.70 3.60 317.83 0.10 45.20
West HS 737 18.50 3.01 1.55 0.58 4.53 241.00 0.04 48.60

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.172e+00  3.213e-03  364.72   <2e-16 ***
## year2023       -3.263e-01  2.800e-03 -116.54   <2e-16 ***
## wd_secQ2       -2.560e-01  3.751e-03  -68.23   <2e-16 ***
## wd_secQ3       -2.898e-01  3.892e-03  -74.46   <2e-16 ***
## wd_secQ4       -2.095e-01  3.492e-03  -59.99   <2e-16 ***
## ws             -5.424e-02  4.176e-04 -129.91   <2e-16 ***
## I(1/MinDist^2)  2.114e+05  2.329e+03   90.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.996      8 2496  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0946   Deviance explained = 9.47%
## GCV = 0.92965  Scale est. = 0.92963   n = 772251
plot(h2s_model_a)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4657067 248.8    8132744  434.4   7275617  388.6
## Vcells 104430763 796.8  270449874 2063.4 224923882 1716.1
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 8), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     s(mon_utm_x, mon_utm_y, bs = "tp", k = 8)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.013e+00  3.170e-03  319.43   <2e-16 ***
## wd_secQ2       -1.953e-01  3.695e-03  -52.86   <2e-16 ***
## wd_secQ3       -1.816e-01  3.905e-03  -46.51   <2e-16 ***
## wd_secQ4       -1.131e-01  3.486e-03  -32.43   <2e-16 ***
## ws             -6.919e-02  4.146e-04 -166.88   <2e-16 ***
## I(1/MinDist^2)  2.983e+05  2.856e+03  104.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.999      8 1710  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 6.999      7 6572  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.131   Deviance explained = 13.1%
## GCV = 0.89282  Scale est. = 0.8928    n = 772251
plot(h2s_model_b)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4669994 249.5    8132744  434.4   7275617  388.6
## Vcells 114946073 877.0  324619848 2476.7 317696763 2423.9
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.494e+00  2.321e-02  107.466  < 2e-16 ***
## year2023             -3.331e-01  2.866e-03 -116.237  < 2e-16 ***
## wd_secQ2             -1.863e-01  3.645e-03  -51.123  < 2e-16 ***
## wd_secQ3             -1.914e-01  3.848e-03  -49.753  < 2e-16 ***
## wd_secQ4             -1.096e-01  3.443e-03  -31.842  < 2e-16 ***
## ws                   -6.123e-02  4.080e-04 -150.068  < 2e-16 ***
## I(1/MinDist^2)       -3.291e-05  3.304e-07  -99.583  < 2e-16 ***
## Converted_Angle      -6.125e-03  1.104e-04  -55.496  < 2e-16 ***
## as.factor(weekday).L  9.861e-02  2.802e-03   35.190  < 2e-16 ***
## as.factor(weekday).Q -1.699e-01  2.810e-03  -60.461  < 2e-16 ***
## as.factor(weekday).C -3.825e-03  2.805e-03   -1.364    0.173    
## as.factor(weekday)^4 -2.049e-02  2.815e-03   -7.277 3.41e-13 ***
## as.factor(weekday)^5 -5.801e-02  2.809e-03  -20.654  < 2e-16 ***
## as.factor(weekday)^6 -2.508e-02  2.821e-03   -8.889  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.998      8 2704  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.999      9 6486  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.155   Deviance explained = 15.6%
## GCV = 0.86717  Scale est. = 0.86714   n = 772251
plot(h2s_model_c)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4671696 249.5    8132744  434.4   7275617  388.6
## Vcells 126969342 968.7  389623817 2972.6 379647656 2896.5
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + dist_wrp + capacity +
                     Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     monthly_oil_1km + monthly_gas_1km + active_1km
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     dist_wrp + capacity + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.598e+00  1.076e-01  -24.142  < 2e-16 ***
## year2023             -1.204e-01  6.381e-03  -18.862  < 2e-16 ***
## as.factor(weekday).L  1.056e-01  2.772e-03   38.089  < 2e-16 ***
## as.factor(weekday).Q -1.728e-01  2.779e-03  -62.180  < 2e-16 ***
## as.factor(weekday).C -6.411e-03  2.773e-03   -2.312   0.0208 *  
## as.factor(weekday)^4 -1.711e-02  2.783e-03   -6.146 7.94e-10 ***
## as.factor(weekday)^5 -5.627e-02  2.776e-03  -20.268  < 2e-16 ***
## as.factor(weekday)^6 -2.645e-02  2.788e-03   -9.486  < 2e-16 ***
## wd_secQ2             -1.503e-01  3.698e-03  -40.648  < 2e-16 ***
## wd_secQ3             -1.289e-01  3.923e-03  -32.865  < 2e-16 ***
## wd_secQ4             -7.449e-02  3.472e-03  -21.453  < 2e-16 ***
## ws                   -6.841e-02  4.074e-04 -167.932  < 2e-16 ***
## downwind_ref          1.374e-01  3.901e-03   35.229  < 2e-16 ***
## downwind_wrp         -1.702e-02  4.304e-03   -3.954 7.69e-05 ***
## I(1/MinDist^2)        2.788e-05  1.646e-06   16.943  < 2e-16 ***
## dist_wrp              3.799e-04  8.572e-06   44.322  < 2e-16 ***
## capacity              6.154e-03  8.181e-05   75.220  < 2e-16 ***
## Converted_Angle      -2.943e-03  1.099e-04  -26.770  < 2e-16 ***
## monthly_oil_1km       6.223e-05  2.765e-06   22.506  < 2e-16 ***
## monthly_gas_1km       1.292e-04  1.243e-05   10.390  < 2e-16 ***
## active_1km           -2.572e-02  1.946e-03  -13.218  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.989  8.000 2000  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.008  8.008 5964  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/38
## R-sq.(adj) =  0.175   Deviance explained = 17.6%
## GCV = 0.84667  Scale est. = 0.84663   n = 772251
plot(h2s_model_d)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4672591  249.6    8132744  434.4   7275617  388.6
## Vcells 143240372 1092.9  467628580 3567.8 389378290 2970.8
# Include elvation, EVI, 3D smooth, odor, dist to dom channel, temp, hum, precip
h2s_model_e <- readRDS('rfiles/h2s_model_e.rds')
# h2s_model_e <- gam(H2S ~ 
#                      s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(day >= '2022-02-01'))
#saveRDS(h2s_model_e, 'rfiles/h2s_model_e.rds')
summary(h2s_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.285e+00  6.510e-01    3.510 0.000449 ***
## year2023              9.870e-02  1.712e-02    5.766 8.12e-09 ***
## as.factor(weekday).L  1.129e-01  2.852e-03   39.594  < 2e-16 ***
## as.factor(weekday).Q -1.747e-01  2.841e-03  -61.484  < 2e-16 ***
## as.factor(weekday).C  1.584e-02  2.824e-03    5.609 2.03e-08 ***
## as.factor(weekday)^4 -1.810e-02  2.828e-03   -6.400 1.55e-10 ***
## as.factor(weekday)^5 -5.583e-02  2.808e-03  -19.883  < 2e-16 ***
## as.factor(weekday)^6 -2.485e-02  2.822e-03   -8.806  < 2e-16 ***
## wd_secQ2             -1.031e-01  3.741e-03  -27.569  < 2e-16 ***
## wd_secQ3             -1.454e-01  3.962e-03  -36.702  < 2e-16 ***
## wd_secQ4             -1.146e-01  3.490e-03  -32.841  < 2e-16 ***
## ws                   -6.036e-02  4.158e-04 -145.169  < 2e-16 ***
## downwind_ref          1.098e-01  3.923e-03   28.001  < 2e-16 ***
## downwind_wrp          8.079e-03  4.326e-03    1.868 0.061802 .  
## I(1/MinDist^2)        1.707e-05  6.264e-06    2.726 0.006420 ** 
## I(1/dist_wrp^2)       1.056e-06  3.794e-07    2.783 0.005394 ** 
## capacity              7.434e-03  1.338e-03    5.556 2.75e-08 ***
## wrp_angle             4.063e-03  9.670e-04    4.201 2.65e-05 ***
## Converted_Angle      -1.586e-02  2.157e-03   -7.352 1.95e-13 ***
## elevation            -2.059e-01  1.293e-02  -15.920  < 2e-16 ***
## EVI                   2.975e+00  6.113e-01    4.868 1.13e-06 ***
## monthly_oil_1km       1.856e-04  6.710e-06   27.663  < 2e-16 ***
## monthly_gas_1km      -4.097e-04  3.277e-05  -12.504  < 2e-16 ***
## active_1km           -3.057e-02  2.530e-03  -12.081  < 2e-16 ***
## num_odor_complaints   2.821e-02  1.917e-03   14.718  < 2e-16 ***
## I(1/dist_dc^2)        1.019e-04  9.015e-06   11.303  < 2e-16 ***
## avg_temp              9.080e-03  4.090e-04   22.202  < 2e-16 ***
## avg_hum              -7.505e-03  1.056e-04  -71.073  < 2e-16 ***
## precip               -8.285e-02  5.145e-03  -16.102  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df       F
## s(as.numeric(month))                                     7.994  8.000 310.126
## s(mon_utm_x,mon_utm_y)                                   1.865  1.865   0.459
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.249 81.253 899.338
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                    0.542    
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/135
## R-sq.(adj) =  0.255   Deviance explained = 25.5%
## GCV = 0.80029  Scale est. = 0.80016   n = 712259
plot(h2s_model_e)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4680354  250.0    8132744  434.4   7275617  388.6
## Vcells 181371461 1383.8  467628580 3567.8 389378290 2970.8
# Model only the month of the event, October 2021
h2s_model_f <- readRDS('rfiles/h2s_model_f.rds')
# h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
#                    data = full_data %>% filter(year == '2021', month == '10'))
# saveRDS(h2s_model_f, 'rfiles/h2s_model_f.rds')
summary(h2s_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.099e-07  4.542e-07   2.003  0.04516 *  
## wd_secQ2             3.129e+00  9.512e-01   3.289  0.00101 ** 
## wd_secQ3             6.611e+00  9.439e-01   7.004 2.50e-12 ***
## wd_secQ4             7.882e+00  8.910e-01   8.845  < 2e-16 ***
## ws                  -2.321e+00  1.159e-01 -20.026  < 2e-16 ***
## downwind_ref         1.639e+00  8.589e-01   1.908  0.05642 .  
## downwind_wrp         1.494e+00  1.077e+00   1.388  0.16529    
## I(1/MinDist^2)      -4.561e-03  1.475e-03  -3.093  0.00198 ** 
## I(1/dist_wrp^2)     -9.325e-04  5.499e-05 -16.959  < 2e-16 ***
## capacity             7.671e-01  1.692e-01   4.533 5.82e-06 ***
## wrp_angle            8.449e-01  1.825e-01   4.629 3.68e-06 ***
## Converted_Angle     -2.556e+00  3.095e-01  -8.258  < 2e-16 ***
## elevation           -2.652e+01  2.679e+00  -9.899  < 2e-16 ***
## EVI                  8.337e+02  1.155e+02   7.220 5.23e-13 ***
## monthly_oil_1km      1.533e-02  7.959e-03   1.926  0.05412 .  
## monthly_gas_1km      2.295e-03  1.205e-03   1.905  0.05680 .  
## active_1km           3.981e-05  2.000e-05   1.991  0.04647 *  
## num_odor_complaints  8.458e-01  6.767e-02  12.498  < 2e-16 ***
## I(1/dist_dc^2)       7.748e+00  3.991e-01  19.415  < 2e-16 ***
## avg_temp             1.349e+00  1.864e-01   7.236 4.66e-13 ***
## avg_hum              1.629e-01  5.258e-02   3.099  0.00194 ** 
## precip              -4.019e+01  7.628e+00  -5.269 1.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(mon_utm_x,mon_utm_y)                                   2.017   2.02  4.625
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.337  84.58 61.960
##                                                         p-value    
## s(mon_utm_x,mon_utm_y)                                  0.00868 ** 
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 108/120
## R-sq.(adj) =  0.265   Deviance explained = 26.5%
## GCV = 5448.6  Scale est. = 5441.4    n = 77510
plot(h2s_model_f)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4681877  250.1    8132744  434.4   7275617  388.6
## Vcells 185357855 1414.2  467628580 3567.8 389378290 2970.8
# Model data excluding the months of the event
h2s_model_g <- readRDS('rfiles/h2s_model_g.rds')
# h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
# saveRDS(h2s_model_g, 'rfiles/h2s_model_g.rds')
summary(h2s_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           5.539e+00  3.610e-01   15.343  < 2e-16 ***
## year2021              3.707e-01  1.375e-02   26.967  < 2e-16 ***
## year2022             -5.225e-01  2.573e-02  -20.307  < 2e-16 ***
## year2023             -6.492e-01  2.547e-02  -25.492  < 2e-16 ***
## as.factor(weekday).L  8.394e-02  2.184e-03   38.426  < 2e-16 ***
## as.factor(weekday).Q -1.356e-01  2.182e-03  -62.151  < 2e-16 ***
## as.factor(weekday).C  1.406e-03  2.184e-03    0.644   0.5197    
## as.factor(weekday)^4 -2.889e-02  2.188e-03  -13.204  < 2e-16 ***
## as.factor(weekday)^5 -1.732e-02  2.181e-03   -7.943 1.98e-15 ***
## as.factor(weekday)^6 -5.050e-03  2.184e-03   -2.312   0.0208 *  
## wd_secQ2             -1.049e-01  2.952e-03  -35.543  < 2e-16 ***
## wd_secQ3             -1.878e-01  2.958e-03  -63.463  < 2e-16 ***
## wd_secQ4             -1.651e-01  2.796e-03  -59.057  < 2e-16 ***
## ws                   -4.613e-02  3.094e-04 -149.116  < 2e-16 ***
## downwind_ref          3.540e-02  2.614e-03   13.545  < 2e-16 ***
## downwind_wrp          2.151e-02  3.270e-03    6.577 4.80e-11 ***
## I(1/MinDist^2)       -4.864e-05  8.792e-06   -5.533 3.15e-08 ***
## I(1/dist_wrp^2)       1.483e-06  9.456e-07    1.568   0.1168    
## capacity              5.414e-03  5.868e-04    9.225  < 2e-16 ***
## wrp_angle             3.176e-03  7.460e-04    4.258 2.06e-05 ***
## Converted_Angle      -1.473e-02  1.020e-03  -14.451  < 2e-16 ***
## elevation            -1.200e-01  9.824e-03  -12.215  < 2e-16 ***
## EVI                   2.459e+00  4.676e-01    5.259 1.44e-07 ***
## monthly_oil_1km      -2.575e-05  2.232e-06  -11.534  < 2e-16 ***
## monthly_gas_1km      -1.831e-04  9.137e-06  -20.036  < 2e-16 ***
## active_1km           -3.881e-02  9.430e-04  -41.155  < 2e-16 ***
## num_odor_complaints   2.632e-02  1.466e-03   17.949  < 2e-16 ***
## I(1/dist_dc^2)        2.766e-03  3.971e-03    0.696   0.4861    
## avg_temp              6.183e-03  2.696e-04   22.937  < 2e-16 ***
## avg_hum              -6.301e-03  7.620e-05  -82.688  < 2e-16 ***
## precip               -7.811e-02  5.252e-03  -14.872  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.999  8.000 2547.0
## s(mon_utm_x,mon_utm_y)                                   1.682  1.683  246.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 85.198 85.296 1455.0
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/137
## R-sq.(adj) =  0.166   Deviance explained = 16.6%
## GCV = 1.1485  Scale est. = 1.1484    n = 1696814
plot(h2s_model_g)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4684353  250.2    8132744  434.4   7275617  388.6
## Vcells 275649503 2103.1  467628580 3567.8 389378290 2970.8

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.031e+00  3.477e-01   8.717   <2e-16 ***
## year2023       -4.346e-01  2.197e-01  -1.978    0.048 *  
## wd_avg          6.613e-04  1.035e-03   0.639    0.523    
## ws_avg         -8.840e-02  6.339e-02  -1.394    0.163    
## I(1/MinDist^2)  7.658e-07  8.232e-08   9.303   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F  p-value    
## s(as.numeric(month)) 2.184      8 2.026 6.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.00874   Deviance explained = 1.07%
## GCV = 21.457  Scale est. = 21.407    n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       9.418e+00  1.420e+01   0.663  0.50715    
## wd_avg                            3.025e-03  1.010e-03   2.994  0.00278 ** 
## ws_avg                           -3.111e-01  6.333e-02  -4.913 9.54e-07 ***
## I(1/MinDist^2)                   -6.123e-05  1.635e-03  -0.037  0.97012    
## RefineryMarathon (Carson)        -9.065e-01  1.761e+01  -0.051  0.95894    
## RefineryMarathon (Wilmington)    -3.737e+00  1.745e+01  -0.214  0.83047    
## RefineryPhillips 66 (Wilmington) -1.409e+01  1.626e+01  -0.866  0.38643    
## RefineryTorrance Refinery        -2.578e+00  1.400e+01  -0.184  0.85387    
## RefineryValero Refinery          -7.568e+00  1.766e+01  -0.429  0.66824    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(as.numeric(month))       2.035  8.000 1.723 0.000203 ***
## s(monitor_lat,monitor_lon) 5.379  5.672 9.312 1.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 25/26
## R-sq.(adj) =  0.127   Deviance explained = 13.2%
## GCV = 18.968  Scale est. = 18.858    n = 2664
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Converted_Angle+
                        s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.608e+00  8.828e-01   4.087  4.5e-05 ***
## wd_avg           2.924e-03  1.015e-03   2.882 0.003983 ** 
## ws_avg          -2.235e-01  6.205e-02  -3.602 0.000322 ***
## I(1/MinDist^2)  -8.626e-06  3.814e-05  -0.226 0.821082    
## Converted_Angle -3.444e-03  3.892e-03  -0.885 0.376317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.113  8.000  2.042 4.88e-05 ***
## s(monitor_lat,monitor_lon) 7.071  7.919 41.137  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/22
## R-sq.(adj) =  0.117   Deviance explained = 12.1%
## GCV = 19.169  Scale est. = 19.074    n = 2664
plot(h2s_dm_model_c)

h2s_dm_model_d <- gam(H2S_daily_max ~ 
                         s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + dist_wrp + capacity +
                         Converted_Angle + 
                         s(monitor_lat, monitor_lon, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km
                      , 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp + 
##     capacity + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.696e+01  5.143e+00  -3.298 0.000986 ***
## year2023             -6.419e-01  4.082e-01  -1.573 0.115927    
## as.factor(weekday).L  4.659e-01  2.216e-01   2.103 0.035589 *  
## as.factor(weekday).Q -9.062e-01  2.223e-01  -4.077  4.7e-05 ***
## as.factor(weekday).C  8.727e-03  2.215e-01   0.039 0.968577    
## as.factor(weekday)^4 -1.728e-01  2.223e-01  -0.777 0.437033    
## as.factor(weekday)^5 -2.848e-01  2.217e-01  -1.285 0.198943    
## as.factor(weekday)^6 -1.696e-01  2.223e-01  -0.763 0.445474    
## wd_avg                3.371e-03  1.028e-03   3.279 0.001057 ** 
## ws_avg               -3.041e-01  6.608e-02  -4.601  4.4e-06 ***
## daily_downwind_ref    7.377e-01  3.989e-01   1.849 0.064527 .  
## I(1/MinDist^2)        2.246e-04  3.883e-04   0.578 0.563013    
## dist_wrp              2.258e-03  6.614e-04   3.415 0.000648 ***
## capacity              9.466e-03  1.166e-02   0.812 0.416776    
## Converted_Angle       9.685e-03  8.699e-03   1.113 0.265657    
## monthly_oil_1km       2.479e-04  1.548e-04   1.602 0.109297    
## monthly_gas_1km      -1.138e-03  6.554e-04  -1.736 0.082603 .  
## active_1km            3.974e-02  6.286e-02   0.632 0.527264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.319  8.000  2.087 6.51e-05 ***
## s(monitor_lat,monitor_lon) 7.785  7.976 14.036  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.137   Deviance explained = 14.5%
## GCV = 18.829  Scale est. = 18.638    n = 2664
plot(h2s_dm_model_d)

# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints + 
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))

summary(h2s_dm_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.184e+01  7.409e+00  -1.598  0.11020    
## year2023                  2.174e-01  1.024e+00   0.212  0.83183    
## as.character(weekday)Mon -6.411e-01  3.101e-01  -2.067  0.03882 *  
## as.character(weekday)Sat -7.266e-01  3.099e-01  -2.344  0.01913 *  
## as.character(weekday)Sun -1.279e+00  3.102e-01  -4.123 3.86e-05 ***
## as.character(weekday)Thu -2.864e-01  3.102e-01  -0.923  0.35608    
## as.character(weekday)Tue -1.762e-01  3.078e-01  -0.572  0.56720    
## as.character(weekday)Wed -4.809e-02  3.118e-01  -0.154  0.87745    
## wd_avg                    2.344e-03  1.084e-03   2.161  0.03078 *  
## ws_avg                   -2.294e-01  7.129e-02  -3.217  0.00131 ** 
## daily_downwind_ref        4.051e-01  3.944e-01   1.027  0.30441    
## I(1/dist_wrp^2)           4.422e-07  1.655e-06   0.267  0.78930    
## capacity                 -5.974e-03  2.648e-03  -2.256  0.02415 *  
## daily_downwind_wrp        1.783e-01  4.040e-01   0.441  0.65903    
## monthly_oil_1km           5.216e-04  2.741e-04   1.903  0.05714 .  
## monthly_gas_1km          -1.588e-03  1.987e-03  -0.799  0.42419    
## active_1km                2.959e-01  1.237e-01   2.393  0.01680 *  
## elevation                -1.318e-01  5.729e-02  -2.301  0.02146 *  
## EVI                      -3.691e+00  7.407e-01  -4.983 6.68e-07 ***
## num_odor_complaints       1.128e-01  1.520e-01   0.742  0.45800    
## I(1/dist_dc^2)            3.870e-05  7.395e-05   0.523  0.60085    
## avg_temp                  6.252e-02  2.837e-02   2.204  0.02763 *  
## avg_hum                  -1.857e-02  7.398e-03  -2.511  0.01212 *  
## precip                    4.781e-01  4.334e-01   1.103  0.27009    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    3.379e-09  8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  4.037e+00  4.451 3.041
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 5.550e+01 76.000 1.469
##                                                          p-value    
## s(as.numeric(month))                                     0.85254    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   0.00945 ** 
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 115/117
## R-sq.(adj) =  0.161   Deviance explained = 18.7%
## GCV = 18.681  Scale est. = 18.109    n = 2664
plot(h2s_dm_model_e)

e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))

summary(h2s_dm_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref + 
##     I(1/dist_wrp^2) + capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.933615   1.112132  -2.638  0.00852 ** 
## as.character(weekday)Mon   3.939302  25.809815   0.153  0.87873    
## as.character(weekday)Sat  -1.274518  24.967489  -0.051  0.95930    
## as.character(weekday)Sun   6.518458  25.156102   0.259  0.79561    
## as.character(weekday)Thu  10.719203  25.476323   0.421  0.67406    
## as.character(weekday)Tue -12.039497  26.266684  -0.458  0.64683    
## as.character(weekday)Wed   9.587227  25.907187   0.370  0.71144    
## wd_avg                     0.079809   0.082575   0.966  0.33411    
## ws_avg                     0.102886   6.616534   0.016  0.98760    
## daily_downwind_ref       -30.773112  24.395106  -1.261  0.20754    
## I(1/dist_wrp^2)           -0.001388   0.000147  -9.442  < 2e-16 ***
## capacity                   6.856419   0.710540   9.650  < 2e-16 ***
## daily_downwind_wrp         8.299696  29.553392   0.281  0.77891    
## monthly_oil_1km            0.053059   0.115651   0.459  0.64652    
## monthly_gas_1km           -0.001199   0.314860  -0.004  0.99696    
## active_1km               -96.629305  36.632200  -2.638  0.00852 ** 
## elevation                 24.515412   7.897485   3.104  0.00198 ** 
## EVI                      804.034316 104.569368   7.689 4.69e-14 ***
## num_odor_complaints        3.663308   0.869852   4.211 2.85e-05 ***
## I(1/dist_dc^2)             9.401428   0.971411   9.678  < 2e-16 ***
## avg_temp                   2.543734   2.454075   1.037  0.30029    
## avg_hum                    0.217984   0.586008   0.372  0.71001    
## precip                    -6.489537  26.367207  -0.246  0.80566    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.929   8.99 12.175
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.052  80.00  4.366
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/112
## R-sq.(adj) =  0.511   Deviance explained = 55.8%
## GCV =  37085  Scale est. = 33454     n = 827
plot(h2s_dm_model_f)

f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp +
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                        monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))

summary(h2s_dm_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.916e+00  5.515e+00   0.529 0.597053    
## year2021                  8.579e-01  9.300e-01   0.922 0.356332    
## year2022                 -2.466e-01  1.422e+00  -0.173 0.862369    
## year2023                 -5.194e-01  1.452e+00  -0.358 0.720584    
## as.character(weekday)Mon -3.061e-01  2.768e-01  -1.106 0.268828    
## as.character(weekday)Sat -5.006e-01  2.749e-01  -1.821 0.068640 .  
## as.character(weekday)Sun -8.683e-01  2.753e-01  -3.153 0.001622 ** 
## as.character(weekday)Thu -4.797e-02  2.758e-01  -0.174 0.861931    
## as.character(weekday)Tue  2.499e-01  2.750e-01   0.909 0.363486    
## as.character(weekday)Wed  4.445e-02  2.770e-01   0.160 0.872503    
## wd_avg                    1.333e-03  1.003e-03   1.329 0.183975    
## ws_avg                   -2.430e-01  6.243e-02  -3.893 0.000100 ***
## daily_downwind_ref        1.440e-01  2.800e-01   0.514 0.607101    
## I(1/dist_wrp^2)          -3.044e-06  6.687e-07  -4.553 5.40e-06 ***
## capacity                  3.387e-04  6.443e-03   0.053 0.958080    
## daily_downwind_wrp       -5.676e-03  3.347e-01  -0.017 0.986471    
## monthly_oil_1km           9.574e-05  1.675e-04   0.572 0.567570    
## monthly_gas_1km           1.013e-04  6.476e-04   0.156 0.875724    
## active_1km                1.156e-01  5.775e-02   2.002 0.045331 *  
## elevation                -2.426e-01  6.877e-02  -3.528 0.000421 ***
## EVI                      -4.221e+00  1.023e+00  -4.125 3.76e-05 ***
## num_odor_complaints       3.987e-01  1.295e-01   3.079 0.002084 ** 
## I(1/dist_dc^2)            1.558e-03  1.652e-03   0.943 0.345664    
## avg_temp                  1.508e-03  2.045e-02   0.074 0.941227    
## avg_hum                  -2.471e-02  6.150e-03  -4.019 5.93e-05 ***
## precip                    4.173e-01  5.063e-01   0.824 0.409849    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.256e-08  8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  7.759e+00  8.001 4.529
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 4.023e+01 80.000 1.062
##                                                          p-value    
## s(as.numeric(month))                                       0.517    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  1.53e-05 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 9.00e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.0679   Deviance explained = 7.87%
## GCV = 33.746  Scale est. = 33.352    n = 6162
plot(h2s_dm_model_g)

g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(monitor_lon, monitor_lat, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(monitor_lon, monitor_lat, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.742e+00  9.324e-01  -5.086 3.91e-07 ***
## year2023             -1.515e-01  5.680e-02  -2.667 0.007704 ** 
## as.factor(weekday).L  8.212e-02  2.360e-02   3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01  2.367e-02  -6.755 1.75e-11 ***
## as.factor(weekday).C  1.925e-02  2.357e-02   0.817 0.414075    
## as.factor(weekday)^4 -6.537e-03  2.364e-02  -0.277 0.782124    
## as.factor(weekday)^5 -4.980e-02  2.357e-02  -2.113 0.034702 *  
## as.factor(weekday)^6 -4.449e-02  2.363e-02  -1.883 0.059767 .  
## wd_avg                7.982e-04  1.096e-04   7.286 4.19e-13 ***
## ws_avg               -1.139e-01  7.202e-03 -15.809  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.253e-02   6.399 1.85e-10 ***
## I(1/MinDist^2)        2.940e-04  3.325e-05   8.844  < 2e-16 ***
## I(1/dist_wrp^2)      -1.625e-05  2.391e-06  -6.798 1.31e-11 ***
## capacity              1.565e-02  1.127e-03  13.885  < 2e-16 ***
## Converted_Angle      -2.730e-03  9.770e-04  -2.795 0.005234 ** 
## monthly_oil_1km       6.935e-05  2.246e-05   3.087 0.002043 ** 
## monthly_gas_1km       4.867e-05  1.016e-04   0.479 0.631782    
## active_1km           -2.583e-02  1.482e-02  -1.743 0.081492 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F p-value    
## s(as.numeric(month))       7.557      8 19.58  <2e-16 ***
## s(monitor_lon,monitor_lat) 8.988      9 85.28  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21298  Scale est. = 0.21037   n = 2664
plot(h2s_da_model_a)

a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.291e+00  8.757e-01  -3.758 0.000175 ***
## year2023             -1.515e-01  5.681e-02  -2.667 0.007708 ** 
## as.factor(weekday).L  8.212e-02  2.360e-02   3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01  2.367e-02  -6.755 1.76e-11 ***
## as.factor(weekday).C  1.925e-02  2.357e-02   0.817 0.414145    
## as.factor(weekday)^4 -6.549e-03  2.364e-02  -0.277 0.781748    
## as.factor(weekday)^5 -4.978e-02  2.357e-02  -2.112 0.034749 *  
## as.factor(weekday)^6 -4.449e-02  2.363e-02  -1.883 0.059817 .  
## wd_avg                7.982e-04  1.095e-04   7.286 4.20e-13 ***
## ws_avg               -1.138e-01  7.202e-03 -15.803  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.252e-02   6.400 1.84e-10 ***
## I(1/MinDist^2)        1.320e-04  1.904e-05   6.931 5.25e-12 ***
## I(1/dist_wrp^2)      -9.831e-06  1.268e-06  -7.752 1.29e-14 ***
## capacity              1.202e-02  8.420e-04  14.280  < 2e-16 ***
## Converted_Angle      -2.608e-03  9.596e-04  -2.718 0.006616 ** 
## monthly_oil_1km       6.932e-05  2.247e-05   3.086 0.002053 ** 
## monthly_gas_1km       4.888e-05  1.016e-04   0.481 0.630348    
## active_1km           -2.584e-02  1.482e-02  -1.743 0.081413 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.559      8 19.58  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.981      9 85.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21297  Scale est. = 0.21037   n = 2664
plot(h2s_da_model_b)

b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.092e+00  8.277e-01  -4.944 8.13e-07 ***
## year2023             -1.502e-01  5.687e-02  -2.642 0.008293 ** 
## as.factor(weekday).L  8.210e-02  2.362e-02   3.475 0.000518 ***
## as.factor(weekday).Q -1.601e-01  2.370e-02  -6.757 1.73e-11 ***
## as.factor(weekday).C  1.946e-02  2.359e-02   0.825 0.409569    
## as.factor(weekday)^4 -6.335e-03  2.366e-02  -0.268 0.788954    
## as.factor(weekday)^5 -5.012e-02  2.359e-02  -2.124 0.033761 *  
## as.factor(weekday)^6 -4.475e-02  2.365e-02  -1.892 0.058579 .  
## wd_avg                7.928e-04  1.097e-04   7.230 6.31e-13 ***
## ws_avg               -1.152e-01  7.196e-03 -16.005  < 2e-16 ***
## daily_downwind_ref    2.708e-01  4.257e-02   6.362 2.35e-10 ***
## capacity              1.264e-02  8.169e-04  15.474  < 2e-16 ***
## I(1/dist_wrp^2)      -1.222e-05  1.314e-06  -9.304  < 2e-16 ***
## monthly_oil_1km       6.987e-05  2.249e-05   3.107 0.001913 ** 
## monthly_gas_1km       4.838e-05  1.017e-04   0.476 0.634221    
## active_1km           -2.578e-02  1.484e-02  -1.737 0.082429 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.558      8 19.47  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.992      9 90.61  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 32/33
## R-sq.(adj) =   0.42   Deviance explained = 42.6%
## GCV = 0.21339  Scale est. = 0.21086   n = 2664
plot(h2s_da_model_c)

c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp + 
##     elevation + EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.080e+00  1.012e+00   1.068  0.28574    
## year2023                 -1.517e-01  5.678e-02  -2.671  0.00760 ** 
## as.character(weekday)Mon -9.035e-02  3.341e-02  -2.705  0.00688 ** 
## as.character(weekday)Sat -9.858e-02  3.336e-02  -2.955  0.00315 ** 
## as.character(weekday)Sun -1.974e-01  3.323e-02  -5.940 3.23e-09 ***
## as.character(weekday)Thu -4.795e-02  3.340e-02  -1.436  0.15121    
## as.character(weekday)Tue -8.359e-03  3.317e-02  -0.252  0.80104    
## as.character(weekday)Wed  3.880e-02  3.354e-02   1.157  0.24744    
## wd_avg                    7.977e-04  1.095e-04   7.283 4.30e-13 ***
## ws_avg                   -1.143e-01  7.206e-03 -15.858  < 2e-16 ***
## daily_downwind_ref        2.737e-01  4.248e-02   6.444 1.38e-10 ***
## capacity                  1.097e-03  1.436e-03   0.764  0.44488    
## I(1/dist_wrp^2)          -8.660e-08  5.329e-08  -1.625  0.10426    
## monthly_oil_1km           6.917e-05  2.246e-05   3.081  0.00209 ** 
## monthly_gas_1km           4.965e-05  1.015e-04   0.489  0.62483    
## active_1km               -2.562e-02  1.481e-02  -1.730  0.08375 .  
## daily_downwind_wrp        5.139e-02  4.297e-02   1.196  0.23182    
## elevation                -1.412e-02  8.007e-03  -1.764  0.07790 .  
## EVI                      -1.067e+00  1.446e-01  -7.374 2.21e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.553  8.000 19.57  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.916  7.997 51.72  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 35/36
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21302  Scale est. = 0.21034   n = 2664
plot(h2s_da_model_d)

d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.705e+00  7.312e-01  -5.067 4.33e-07 ***
## year2023                  3.857e-01  1.227e-01   3.144 0.001688 ** 
## as.character(weekday)Mon -9.050e-02  2.665e-02  -3.396 0.000695 ***
## as.character(weekday)Sat -9.599e-02  2.661e-02  -3.607 0.000316 ***
## as.character(weekday)Sun -1.993e-01  2.653e-02  -7.513 7.97e-14 ***
## as.character(weekday)Thu -4.584e-02  2.663e-02  -1.721 0.085311 .  
## as.character(weekday)Tue -1.143e-02  2.646e-02  -0.432 0.665634    
## as.character(weekday)Wed  3.472e-02  2.675e-02   1.298 0.194507    
## wd_avg                    6.304e-04  9.123e-05   6.910 6.10e-12 ***
## ws_avg                   -1.064e-01  6.042e-03 -17.612  < 2e-16 ***
## daily_downwind_ref        2.073e-01  3.439e-02   6.029 1.89e-09 ***
## capacity                  1.147e-02  1.482e-03   7.741 1.41e-14 ***
## I(1/dist_wrp^2)           1.399e-07  1.185e-07   1.181 0.237685    
## monthly_oil_1km           1.385e-04  4.413e-05   3.138 0.001719 ** 
## monthly_gas_1km           3.211e-04  2.507e-04   1.281 0.200372    
## active_1km               -4.537e-02  1.792e-02  -2.532 0.011400 *  
## daily_downwind_wrp        5.485e-02  3.489e-02   1.572 0.116065    
## elevation                -2.933e-02  1.071e-02  -2.738 0.006223 ** 
## EVI                      -6.902e-01  1.770e-01  -3.900 9.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.841  8.000 12.37
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 21.76
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.879 76.000 20.86
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 110/112
## R-sq.(adj) =  0.633   Deviance explained = 64.8%
## GCV = 0.13916  Scale est. = 0.13343   n = 2664
plot(h2s_da_model_e)

e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Since feb 2022
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -4.696e+00  7.111e-01  -6.603 4.88e-11 ***
## year2023                  4.435e-01  1.232e-01   3.600 0.000324 ***
## as.character(weekday)Mon -8.809e-02  2.550e-02  -3.454 0.000561 ***
## as.character(weekday)Sat -9.022e-02  2.540e-02  -3.552 0.000389 ***
## as.character(weekday)Sun -2.061e-01  2.542e-02  -8.108 7.92e-16 ***
## as.character(weekday)Thu -3.686e-02  2.542e-02  -1.450 0.147173    
## as.character(weekday)Tue -8.548e-03  2.527e-02  -0.338 0.735230    
## as.character(weekday)Wed  2.998e-02  2.555e-02   1.173 0.240810    
## wd_avg                    3.508e-04  9.030e-05   3.885 0.000105 ***
## ws_avg                   -7.855e-02  6.026e-03 -13.035  < 2e-16 ***
## daily_downwind_ref        1.567e-01  3.302e-02   4.744 2.21e-06 ***
## capacity                  1.003e-02  1.427e-03   7.031 2.63e-12 ***
## I(1/dist_wrp^2)           2.122e-07  1.133e-07   1.873 0.061128 .  
## monthly_oil_1km           1.611e-04  4.233e-05   3.805 0.000145 ***
## monthly_gas_1km          -1.306e-04  2.429e-04  -0.538 0.590730    
## active_1km               -9.257e-03  1.728e-02  -0.536 0.592319    
## daily_downwind_wrp        6.698e-02  3.329e-02   2.012 0.044311 *  
## elevation                -2.960e-02  1.022e-02  -2.897 0.003804 ** 
## EVI                      -7.131e-01  1.698e-01  -4.200 2.76e-05 ***
## num_odor_complaints       2.552e-02  1.252e-02   2.038 0.041693 *  
## I(1/dist_dc^2)            2.080e-05  9.122e-06   2.281 0.022659 *  
## avg_temp                  1.206e-02  2.685e-03   4.491 7.42e-06 ***
## avg_hum                  -5.599e-03  7.027e-04  -7.968 2.42e-15 ***
## precip                   -7.076e-02  3.637e-02  -1.945 0.051851 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.854  8.000 13.30
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 23.66
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.878 76.000 23.14
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 114/117
## R-sq.(adj) =  0.666   Deviance explained =   68%
## GCV = 0.12669  Scale est. = 0.12128   n = 2664
plot(h2s_da_model_f)

f <- getViz(h2s_da_model_f)
plot(sm(f, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Disaster only
h2s_da_model_g <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_da_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ month + as.character(weekday) + wd_avg + ws_avg + 
##     daily_downwind_ref + capacity + I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     I(1/dist_dc^2) + avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.2213924  0.0934139  -2.370  0.01804 *  
## month11                  -4.2142854  1.7782277  -2.370  0.01804 *  
## month12                   1.1011938  0.4647034   2.370  0.01806 *  
## as.character(weekday)Mon -2.7259172  2.9255211  -0.932  0.35176    
## as.character(weekday)Sat -2.8891542  2.8298781  -1.021  0.30761    
## as.character(weekday)Sun -2.4956249  2.8514127  -0.875  0.38173    
## as.character(weekday)Thu -0.3339398  2.8872365  -0.116  0.90795    
## as.character(weekday)Tue -4.5461381  2.9772550  -1.527  0.12720    
## as.character(weekday)Wed -1.2618609  2.9362292  -0.430  0.66750    
## wd_avg                    0.0080694  0.0093628   0.862  0.38904    
## ws_avg                   -0.0079586  0.7510760  -0.011  0.99155    
## daily_downwind_ref       -2.7311456  2.7658732  -0.987  0.32375    
## capacity                  0.7818623  0.0804573   9.718  < 2e-16 ***
## I(1/dist_wrp^2)          -0.0001603  0.0000167  -9.600  < 2e-16 ***
## monthly_oil_1km          -0.0058962  0.0097159  -0.607  0.54413    
## monthly_gas_1km           0.0147016  0.0338346   0.435  0.66404    
## active_1km               -7.2923513  3.0769295  -2.370  0.01804 *  
## daily_downwind_wrp       -0.8402903  3.3511018  -0.251  0.80208    
## elevation                 2.7201937  0.8943859   3.041  0.00244 ** 
## EVI                      90.9526953 11.8445995   7.679 5.05e-14 ***
## num_odor_complaints       0.5226646  0.0988284   5.289 1.62e-07 ***
## I(1/dist_dc^2)            1.0786879  0.1100253   9.804  < 2e-16 ***
## avg_temp                  0.3827439  0.2785720   1.374  0.16987    
## avg_hum                   0.0272100  0.0665065   0.409  0.68256    
## precip                   -2.5913331  2.9921008  -0.866  0.38674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.926  8.989 12.392
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.823 80.000  3.689
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/114
## R-sq.(adj) =  0.501   Deviance explained =   55%
## GCV = 476.76  Scale est. = 429.63    n = 827
plot(h2s_da_model_g)

g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

# Exclude disaster
h2s_da_model_h <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_da_model_h)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -4.523e+00  1.295e+00  -3.493 0.000482 ***
## year2021                  3.333e-01  9.799e-02   3.402 0.000674 ***
## year2022                 -5.309e-01  2.081e-01  -2.551 0.010766 *  
## year2023                 -6.752e-01  2.155e-01  -3.133 0.001738 ** 
## as.character(weekday)Mon -5.458e-02  2.341e-02  -2.331 0.019771 *  
## as.character(weekday)Sat -8.351e-02  2.322e-02  -3.596 0.000325 ***
## as.character(weekday)Sun -1.702e-01  2.328e-02  -7.313 2.96e-13 ***
## as.character(weekday)Thu -1.154e-02  2.329e-02  -0.496 0.620154    
## as.character(weekday)Tue -1.368e-02  2.326e-02  -0.588 0.556374    
## as.character(weekday)Wed  1.616e-02  2.341e-02   0.690 0.490096    
## wd_avg                    1.512e-04  8.622e-05   1.754 0.079466 .  
## ws_avg                   -7.304e-02  5.490e-03 -13.304  < 2e-16 ***
## daily_downwind_ref        2.274e-02  2.396e-02   0.949 0.342705    
## capacity                  1.873e-02  2.376e-03   7.884 3.73e-15 ***
## I(1/dist_wrp^2)          -4.082e-06  1.369e-06  -2.983 0.002870 ** 
## monthly_oil_1km          -4.735e-05  1.804e-05  -2.625 0.008694 ** 
## monthly_gas_1km          -2.152e-04  7.046e-05  -3.054 0.002271 ** 
## active_1km               -4.449e-02  7.093e-03  -6.273 3.80e-10 ***
## daily_downwind_wrp        2.858e-02  2.842e-02   1.006 0.314673    
## elevation                 7.088e-02  1.153e-02   6.149 8.28e-10 ***
## EVI                       9.990e-01  2.838e-01   3.520 0.000435 ***
## num_odor_complaints       2.815e-02  1.119e-02   2.516 0.011891 *  
## I(1/dist_dc^2)            3.415e-02  8.835e-03   3.865 0.000112 ***
## avg_temp                  5.698e-03  2.075e-03   2.747 0.006037 ** 
## avg_hum                  -6.004e-03  5.830e-04 -10.299  < 2e-16 ***
## precip                   -3.164e-02  4.322e-02  -0.732 0.464151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df     F
## s(as.numeric(month))                                     7.92      8 44.56
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 24.04
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.52     80 23.17
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.462   Deviance explained = 47.2%
## GCV = 0.24227  Scale est. = 0.23773   n = 6162
plot(h2s_da_model_h)

h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Disaster indicator
h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip + disaster, 
                      data = daily_full %>% mutate(disaster = if_else(year == '2021', 
                                                                      month %in% c('10', '11', '12'), 1, 0)))
summary(h2s_da_model_i)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip + disaster
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.573e+02  1.399e+01 -25.530  < 2e-16 ***
## year2021                  3.989e+00  1.666e+00   2.394  0.01667 *  
## year2022                  5.441e+00  2.075e+00   2.622  0.00875 ** 
## year2023                  4.519e+00  2.475e+00   1.826  0.06786 .  
## as.character(weekday)Mon -3.258e-01  3.764e-01  -0.866  0.38676    
## as.character(weekday)Sat -2.928e-01  3.732e-01  -0.785  0.43272    
## as.character(weekday)Sun -3.098e-01  3.734e-01  -0.830  0.40675    
## as.character(weekday)Thu -2.927e-01  3.745e-01  -0.782  0.43448    
## as.character(weekday)Tue -6.662e-01  3.743e-01  -1.780  0.07516 .  
## as.character(weekday)Wed -3.629e-01  3.757e-01  -0.966  0.33414    
## wd_avg                    1.248e-03  1.348e-03   0.926  0.35423    
## ws_avg                   -7.136e-02  8.683e-02  -0.822  0.41120    
## daily_downwind_ref       -1.815e-02  3.815e-01  -0.048  0.96205    
## capacity                  6.881e-01  2.416e-02  28.477  < 2e-16 ***
## I(1/dist_wrp^2)          -2.730e-04  1.995e-05 -13.687  < 2e-16 ***
## monthly_oil_1km           3.638e-04  2.667e-04   1.364  0.17256    
## monthly_gas_1km           7.303e-04  9.757e-04   0.748  0.45420    
## active_1km                9.149e-02  9.378e-02   0.976  0.32928    
## daily_downwind_wrp        4.554e-01  4.533e-01   1.005  0.31508    
## elevation                 2.536e+00  1.487e-01  17.053  < 2e-16 ***
## EVI                       7.930e+01  3.021e+00  26.253  < 2e-16 ***
## num_odor_complaints       9.522e-01  2.756e-02  34.552  < 2e-16 ***
## I(1/dist_dc^2)            2.030e+00  1.117e-01  18.171  < 2e-16 ***
## avg_temp                  1.727e-02  3.066e-02   0.563  0.57323    
## avg_hum                  -2.637e-04  8.511e-03  -0.031  0.97529    
## precip                   -4.241e-02  5.876e-01  -0.072  0.94246    
## disaster                  3.211e+00  1.026e+00   3.130  0.00176 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     1.783  8.000  0.327
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.957  8.997 94.223
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.836 80.000  6.382
##                                                         p-value    
## s(as.numeric(month))                                      0.186    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/124
## R-sq.(adj) =  0.331   Deviance explained = 34.2%
## GCV =  70.74  Scale est. = 69.57     n = 6989
plot(h2s_da_model_i)

i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(i, 2), fix = c(`as.numeric(day)` = 0), residuals = F)

# try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_i), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Everything
h2s_da_model_j <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full)
summary(h2s_da_model_j)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.496e+02  1.385e+01 -25.237   <2e-16 ***
## year2021                  7.052e-01  1.335e+00   0.528   0.5974    
## year2022                  1.405e+00  1.689e+00   0.832   0.4055    
## year2023                  6.499e-01  2.180e+00   0.298   0.7656    
## as.character(weekday)Mon -3.333e-01  3.772e-01  -0.884   0.3769    
## as.character(weekday)Sat -2.687e-01  3.740e-01  -0.719   0.4724    
## as.character(weekday)Sun -2.956e-01  3.742e-01  -0.790   0.4297    
## as.character(weekday)Thu -3.134e-01  3.753e-01  -0.835   0.4036    
## as.character(weekday)Tue -6.738e-01  3.751e-01  -1.796   0.0725 .  
## as.character(weekday)Wed -3.640e-01  3.765e-01  -0.967   0.3337    
## wd_avg                    1.248e-03  1.350e-03   0.925   0.3551    
## ws_avg                   -7.657e-02  8.704e-02  -0.880   0.3790    
## daily_downwind_ref       -7.636e-02  3.820e-01  -0.200   0.8416    
## capacity                  6.803e-01  2.395e-02  28.403   <2e-16 ***
## I(1/dist_wrp^2)          -2.625e-04  1.033e-05 -25.420   <2e-16 ***
## monthly_oil_1km           4.951e-04  2.608e-04   1.899   0.0577 .  
## monthly_gas_1km           3.955e-04  9.490e-04   0.417   0.6769    
## active_1km                1.023e-01  9.493e-02   1.077   0.2814    
## daily_downwind_wrp        3.830e-01  4.538e-01   0.844   0.3986    
## elevation                 2.481e+00  1.435e-01  17.287   <2e-16 ***
## EVI                       7.803e+01  2.932e+00  26.614   <2e-16 ***
## num_odor_complaints       9.839e-01  2.678e-02  36.738   <2e-16 ***
## I(1/dist_dc^2)            1.967e+00  7.384e-02  26.644   <2e-16 ***
## avg_temp                  2.239e-02  3.056e-02   0.733   0.4638    
## avg_hum                  -1.267e-03  8.550e-03  -0.148   0.8822    
## precip                   -1.722e-01  5.868e-01  -0.293   0.7692    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     2.676      8  0.630
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 91.536
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.173     80  5.593
##                                                         p-value    
## s(as.numeric(month))                                     0.0709 .  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.328   Deviance explained = 33.8%
## GCV = 70.933  Scale est. = 69.879    n = 6989
plot(h2s_da_model_j)

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))))
Model R-Sq
Since Feb 2022 0.67
Disaster Only 0.50
Exclude Disaster 0.46
Everything w Disaster Indicator 0.33
Everything w.o Disaster Indicator 0.33

80/20 Cross Validation on Daily Average models

result <- tibble(Model = character(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

validation_result_8020 <- tibble(Model = character(),
                            'Coef' = numeric(),
                            'R-Sq' = numeric())

predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp', 
                'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km', 
                'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints',
                'dist_dc', 'capacity', 'avg_temp', 'avg_hum', 'precip') 

set.seed(123)

# model 1: since Feb 2022
train <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(day >= '2022-02-01') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
  (summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)

result <- rbind(result, tibble(Model = 'Since Feb 2022',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f2_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 GAM') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))

# Model 2: Disaster (Oct-Dec 2021) only
train <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f3 <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                       wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                       s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                       te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                          k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                       monthly_oil_1km + monthly_gas_1km + active_1km + 
                       daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                       capacity + dist_dc + avg_temp + avg_hum + precip, 
                      data = train)

predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
  (summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)

result <- rbind(result, tibble(Model = 'Disaster Only',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f3_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only GAM') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))

# Model 3: Excluding Disaster
train <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
  (summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)

result <- rbind(result, tibble(Model = 'Exclude Disaster',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 4: Everything with Disaster Indicator
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
  filter(complete.cases(.)) 

train <- rbind(
  train %>% filter(disaster==1) %>% slice_sample(prop = 0.8),
  train %>% filter(disaster==0) %>% slice_sample(prop = 0.8)
)

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                    mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip, disaster)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)

result <- rbind(result, tibble(Model = 'Everything w. Disaster Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 5: Everything
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

train <- rbind(
  train %>% filter(year == '2021' & month %in% c('10', '11', '12')) %>% slice_sample(prop = 0.8),
  train %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% slice_sample(prop = 0.8)
  )

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f6 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f6, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f6)$np - 1)

result <- rbind(result, tibble(Model = 'Everything w.o Disaster Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f6)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f6_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator Gam') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(NA, 4), ylim = c(NA, 4)) +
                        theme_bw()

validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)))
Model R-Sq 80/20 Train R-Sq 80/20 Test R-Sq
Since Feb 2022 0.67 0.6538821 0.6823609
Disaster Only 0.50 0.5175137 0.2921882
Exclude Disaster 0.46 0.5189897 0.2950658
Everything w Disaster Indicator 0.33 NA NA
Everything w.o Disaster Indicator 0.33 0.3372143 0.4443146

Cross Validation on each monitor

result_cv <- tibble(Model = character(),
                 'CV Avg Train R-Sq' = numeric(),
                 'CV Avg Test R-Sq' = numeric())

# model 1: since Feb 2022
post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(post2022feb$Monitor)

cv1_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- post2022feb %>%
    filter(Monitor != monitors[i])
  
  test <- post2022feb %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv1_result <- rbind(cv1_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
                                           'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))

# model 2: Disaster only
disaster <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(disaster$Monitor)

cv2_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- disaster %>%
    filter(Monitor != monitors[i])
  
  test <- disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f3b <- gam(H2S_daily_avg~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f3b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3b)$n - 1)/
    (summary(h2s_da_model_f3b)$n - summary(h2s_da_model_f3b)$np - 1)
  
  cv2_result <- rbind(cv2_result, tibble(monitors[i],
                                         summary(h2s_da_model_f3b)$r.sq, 
                                         summary(h2s_da_model_f3b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Only',
                                           'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f3b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))

# model 3: Exclude Disaster
exclude_disaster <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(exclude_disaster$Monitor)

cv3_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- exclude_disaster %>%
    filter(Monitor != monitors[i])
  
  test <- exclude_disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f4b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f4b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4b)$n - 1)/
    (summary(h2s_da_model_f4b)$n - summary(h2s_da_model_f4b)$np - 1)
  
  cv3_result <- rbind(cv3_result, tibble(monitors[i],
                                         summary(h2s_da_model_f4b)$r.sq, 
                                         summary(h2s_da_model_f4b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude Disaster',
                                           'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f4b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))

# model 4: Everything w Disaster Indicator 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.)) %>%
  mutate(disaster = if_else(year == '2021' &  month %in% c('10', '11', '12'), 1, 0))

monitors <- unique(full_complete$Monitor)

cv4_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f5b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster + capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f5b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5b)$n - 1)/
    (summary(h2s_da_model_f5b)$n - summary(h2s_da_model_f5b)$np - 1)
  
  cv4_result <- rbind(cv4_result, tibble(monitors[i],
                                         summary(h2s_da_model_f5b)$r.sq, 
                                         summary(h2s_da_model_f5b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w. Disaster Indicator',
                                           'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f5b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))

# model 5: Everything w.o Disaster Indicator 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(full_complete$Monitor)

cv5_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f6b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f6b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6b)$n - 1)/
    (summary(h2s_da_model_f6b)$n - summary(h2s_da_model_f6b)$np - 1)
  
  cv5_result <- rbind(cv5_result, tibble(monitors[i],
                                         summary(h2s_da_model_f6b)$r.sq, 
                                         summary(h2s_da_model_f6b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w.o Disaster Indicator',
                                           'CV Avg Train R-Sq' = mean(cv5_result$`summary(h2s_da_model_f6b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv5_result$adj_r2_test))))

cv1_result
cv2_result
cv3_result
cv4_result
cv5_result

10-fold CV

set.seed(1)
random_seeds <- floor(runif(5, min=0, max=1)*100)
result_10cv <- tibble(Model = character(),
                 '10CV AVG Train R-Sq' = numeric(),
                 '10CV AVG Test R-Sq' = numeric())

validation_result_10cv <- tibble(Model = character(),
                            'Coef' = numeric(),
                            'R-Sq' = numeric())

# model 1: since Feb 2022
obs_10cv_sincefeb2022 <- c()
pred_10cv_sincefeb2022 <- c()
adj_r2_sincefeb2022 <- c()
adj_r2_test_sincefeb2022 <- c()

sincefeb2022 <- daily_full %>% 
filter(day >= '2022-02-01') %>% 
select(all_of(predictors)) %>% 
filter(complete.cases(.))

set.seed(random_seeds[1])
folds <- createFolds(seq(1, nrow(sincefeb2022)), k = 10)
  
for (fold in seq(1, 10)) {
  test <- sincefeb2022[folds[[fold]], ]
  train <- anti_join(sincefeb2022, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                           wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                           s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                           te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                              k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                           monthly_oil_1km + monthly_gas_1km + active_1km + 
                           daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                           capacity + dist_dc + avg_temp + avg_hum + precip, 
                          data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_sincefeb2022 <- append(obs_10cv_sincefeb2022, test %>% pull(H2S_daily_avg))
  pred_10cv_sincefeb2022 <- append(pred_10cv_sincefeb2022, predictions)
  adj_r2_sincefeb2022 <- append(adj_r2_sincefeb2022, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_sincefeb2022 <- append(adj_r2_sincefeb2022, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Since Feb 2022',
                               '10CV AVG Train R-Sq' = mean(adj_r2_sincefeb2022),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_sincefeb2022)))

f2_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_sincefeb2022, pred = pred_10cv_sincefeb2022),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 GAM 10CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(obs_10cv_sincefeb2022 ~
                                                        pred_10cv_sincefeb2022))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_sincefeb2022 ~
                                                        pred_10cv_sincefeb2022))$r.squared))

# Model 2: Disaster (Oct-Dec 2021) only
obs_10cv_disaster <- c()
pred_10cv_disaster <- c()
adj_r2_disaster <- c()
adj_r2_test_disaster <- c()

disaster <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[2])
folds <- createFolds(seq(1, nrow(disaster)), k = 10)

for (fold in seq(1, 10)) {
  test <- disaster[folds[[fold]], ]
  train <- anti_join(disaster, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_disaster <- append(obs_10cv_disaster, test %>% pull(H2S_daily_avg))
  pred_10cv_disaster <- append(pred_10cv_disaster, predictions)
  adj_r2_disaster <- append(adj_r2_disaster, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_disaster <- append(adj_r2_disaster, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Disaster Only',
                               '10CV AVG Train R-Sq' = mean(adj_r2_disaster),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_disaster)))

f3_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_disaster, pred = pred_10cv_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only GAM 10CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(obs_10cv_disaster ~
                                                        pred_10cv_disaster))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_disaster ~
                                                        pred_10cv_disaster))$r.squared))

# Model 3: Excluding Disaster
obs_10cv_excl_disaster <- c()
pred_10cv_excl_disaster <- c()
adj_r2_excl_disaster <- c()
adj_r2_test_excl_disaster <- c()

excl_disaster <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[3])
folds <- createFolds(seq(1, nrow(excl_disaster)), k = 10)

for (fold in seq(1, 10)) {
  test <- excl_disaster[folds[[fold]], ]
  train <- anti_join(excl_disaster, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_excl_disaster <- append(obs_10cv_excl_disaster, test %>% pull(H2S_daily_avg))
  pred_10cv_excl_disaster <- append(pred_10cv_excl_disaster, predictions)
  adj_r2_excl_disaster <- append(adj_r2_excl_disaster, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_excl_disaster <- append(adj_r2_excl_disaster, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Exclude Disaster',
                               '10CV AVG Train R-Sq' = mean(adj_r2_excl_disaster),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_excl_disaster)))

# Model 4: Everything with Disaster Indicator
obs_10cv_disaster_ind <- c()
pred_10cv_disaster_ind <- c()
adj_r2_disaster_ind <- c()
adj_r2_test_disaster_ind <- c()

disaster_ind <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
  filter(complete.cases(.)) 

set.seed(random_seeds[4])
folds_1 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==1))), k = 10)
folds_0 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==0))), k = 10)

for (fold in seq(1, 10)) {
  test <- rbind(disaster_ind[folds_1[[fold]], ], disaster_ind[folds_0[[fold]], ])
  
  train <- anti_join(disaster_ind, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_disaster_ind <- append(obs_10cv_disaster_ind, test %>% pull(H2S_daily_avg))
  pred_10cv_disaster_ind <- append(pred_10cv_disaster_ind, predictions)
  adj_r2_disaster_ind <- append(adj_r2_disaster_ind, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_disaster_ind <- append(adj_r2_disaster_ind, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w. Disaster Indicator',
                               '10CV AVG Train R-Sq' = mean(adj_r2_disaster_ind),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_disaster_ind)))


# Model 5: Everything
obs_10cv_everything <- c()
pred_10cv_everything <- c()
adj_r2_everything <- c()
adj_r2_test_everything <- c()

everything <- daily_full %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[5])
folds <- createFolds(seq(1, nrow(everything), k = 10))
## Warning: In seq.default(1, nrow(everything), k = 10) :
##  extra argument 'k' will be disregarded
for (fold in seq(1, 10)) {
  test <- everything[folds[[fold]], ]
  train <- anti_join(everything, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_everything <- append(obs_10cv_everything, test %>% pull(H2S_daily_avg))
  pred_10cv_everything <- append(pred_10cv_everything, predictions)
  adj_r2_everything <- append(adj_r2_everything, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_everything <- append(adj_r2_everything, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w.o Disaster Indicator',
                               '10CV AVG Train R-Sq' = mean(adj_r2_everything),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_everything)))

f6_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator GAM CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_10cv_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(NA, 15), ylim = c(NA, 50)) +
                        theme_bw()

validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(obs_10cv_everything ~
                                                        pred_10cv_everything))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_everything ~
                                                        pred_10cv_everything))$r.squared))

GAM result

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w. Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)) %>%
               left_join(result_cv, join_by(Model)) %>%
               left_join(result_10cv, join_by(Model)),
             digits = 3)
Model Train R-sq 80/20 Train R-Sq 80/20 Test R-Sq CV Avg Train R-Sq CV Avg Test R-Sq 10CV AVG Train R-Sq 10CV AVG Test R-Sq
Since Feb 2022 0.67 0.654 0.682 0.667 0.236 0.664 0.665
Disaster Only 0.50 0.518 0.292 0.671 0.094 0.497 0.439
Exclude Disaster 0.46 0.519 0.295 0.477 0.085 0.463 0.446
Everything w. Disaster Indicator 0.33 0.386 0.612 0.498 0.115 0.321 0.327
Everything w.o Disaster Indicator 0.33 0.337 0.444 0.496 0.107 0.329 0.341

Observed vs. Predicted plots

ggarrange(f2_obs_vs_pred_plot, 
          f3_obs_vs_pred_plot,
          ggarrange(f6_obs_vs_pred_plot, f6_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")), 
          labels = c("1", "2"),
          nrow = 3)

knitr::kable(validation_result_8020, digits = 3)
Model Coef R-Sq
Since Feb 2022 0.966 0.700
Disaster Only 0.761 0.414
Everything 1.201 0.321
ggarrange(f2_10cv_obs_vs_pred_plot, 
          f3_10cv_obs_vs_pred_plot,
          ggarrange(f6_10cv_obs_vs_pred_plot, f6_10cv_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")), 
          labels = c("1", "2"),
          nrow = 3)

knitr::kable(validation_result_10cv, digits = 3)
Model Coef R-Sq
Since Feb 2022 0.980 0.649
Disaster Only 0.951 0.456
Everything 0.865 0.271

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.469e+00  3.010e-01   4.879 1.07e-06 ***
## year2021                          4.569e+00  1.704e-01  26.817  < 2e-16 ***
## year2022                         -4.981e+00  1.895e-01 -26.285  < 2e-16 ***
## year2023                         -2.272e+00  2.389e-01  -9.510  < 2e-16 ***
## wd_secQ2                         -9.341e-02  1.781e-01  -0.524      0.6    
## wd_secQ3                          1.662e+00  1.811e-01   9.179  < 2e-16 ***
## wd_secQ4                         -8.799e-01  1.712e-01  -5.140 2.74e-07 ***
## ws                                7.904e-02  1.924e-02   4.109 3.98e-05 ***
## I(1/MinDist^2)                    9.127e+05  1.544e+05   5.911 3.41e-09 ***
## RefineryMarathon (Carson)         2.223e+01  2.465e-01  90.170  < 2e-16 ***
## RefineryMarathon (Wilmington)     3.672e+00  2.867e-01  12.808  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)  2.444e+00  2.532e-01   9.652  < 2e-16 ***
## RefineryTorrance Refinery        -1.154e+00  2.332e-01  -4.949 7.48e-07 ***
## RefineryValero Refinery           5.945e+00  2.804e-01  21.202  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df    F p-value    
## s(as.numeric(month))   8      8 5716  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0357   Deviance explained = 3.57%
## GCV = 5419.9  Scale est. = 5419.9    n = 2056378
plot(h2s_ma_model_a)

# h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
# summary(h2s_ma_model_b)
# plot(h2s_ma_model_b)

XGBoost Since 2022 Feb

validation_result <- tibble(Model = character(),
                                 'Coef' = character(),
                                 'R-Sq' = numeric())

xgb_result <- tibble(Model = character(),
                 'R-Sq' = numeric(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Monitor, ' ', '_'),
         weekday = as.character(weekday),
         daily_downwind_ref = as.integer(daily_downwind_ref),
         daily_downwind_wrp = as.integer(daily_downwind_wrp))

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01')

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_sincefeb2022 <- c()
pred_xgb_sincefeb2022 <- c()
r2_test_xgb_sincefeb2022 <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_sincefeb2022 <- append(obs_xgb_sincefeb2022, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_sincefeb2022 <- append(pred_xgb_sincefeb2022, test_pred)
  r2_test_xgb_sincefeb2022 <- append(r2_test_xgb_sincefeb2022, test_r2)
}
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_sincefeb2022, pred = pred_xgb_sincefeb2022),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(obs_xgb_sincefeb2022 ~
                                                        pred_xgb_sincefeb2022))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_sincefeb2022 ~
                                                        pred_xgb_sincefeb2022))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 40.97
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.03
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4.1209
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 20)

80/20 CV

set.seed(313)

validation_result_8020 <- tibble(Model = character(),
                                 'Coef' = character(),
                                 'R-Sq' = numeric())

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01') %>%
  slice_sample(prop = 0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(day >= '2022-02-01'), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)


# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da_8020 <- readRDS('rfiles/fit.xgb_da_8020.rds')
# fit.xgb_da_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_8020, 'rfiles/fit.xgb_da_8020.rds')
getTrainPerf(fit.xgb_da_8020)
# Test statistics
predictions <- predict(fit.xgb_da_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 0.2835988 0.7685410 0.1748351
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Since Feb 2022',
                           'R-Sq' = getTrainPerf(fit.xgb_da)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_sincefeb2022_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 20)

CV by leaving each monitor out once

post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01')

monitor_names <- unique(post2022feb$Monitor)

# for (i in 1:length(monitor_names)) {
#   train <- post2022feb %>%

#     filter(Monitor != monitor_names[i])
# 
#   train <- train[complete.cases(train),]
# 
#   train <- fastDummies::dummy_cols(train %>%
#                      select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
#                              monitor_lat, monitor_lon, county, dist_213)) %>%
#                      mutate(MinDist = 1/(MinDist^2),
#                             dist_wrp = 1/(dist_wrp^2)),
#                      remove_selected_columns = TRUE)
# 
#   fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# 
#   saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())

add_cols <- function(df, cols) {
  add <- cols[!cols %in% names(df)]
  if(length(add) !=0 ) df[add] <- NA
  return(df)
}

for (i in setdiff(1:length(monitor_names), c(9))) {

  fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))

  test <- post2022feb %>%
    filter(Monitor == monitor_names[i])

  test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                     ungroup() %>%
                     select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                       add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
                                  'month_03', 'month_04', 'month_05', 'month_06',
                                  'month_07', 'month_08', 'month_09', 'month_10',
                                  'month_11', 'month_12')) %>%
                     mutate(MinDist = 1/(MinDist^2),
                            dist_wrp = 1/(dist_wrp^2),
                            year_2022 = ifelse(is.na(year_2022), 0, year_2022),
                            year_2023 = ifelse(is.na(year_2023), 0, year_2023),
                            month_01 = ifelse(is.na(month_01), 0, month_01),
                            month_02 = ifelse(is.na(month_02), 0, month_02),
                            month_03 = ifelse(is.na(month_03), 0, month_03),
                            month_04 = ifelse(is.na(month_04), 0, month_04),
                            month_05 = ifelse(is.na(month_05), 0, month_05),
                            month_06 = ifelse(is.na(month_06), 0, month_06),
                            month_07 = ifelse(is.na(month_07), 0, month_07),
                            month_08 = ifelse(is.na(month_08), 0, month_08),
                            month_09 = ifelse(is.na(month_09), 0, month_09),
                            month_10 = ifelse(is.na(month_10), 0, month_10),
                            month_11 = ifelse(is.na(month_11), 0, month_11),
                            month_12 = ifelse(is.na(month_12), 0, month_12)),
                     remove_selected_columns = TRUE)

  train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared

  predictions <- predict(fit.xgb_da$finalModel,
                         newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())

  test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)

  model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}

model_stats
tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
                             test_r2 = mean(model_stats$test_r2, na.rm = T))

XGBoost Disaster

Daily average model

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12'))

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_dis <- readRDS('rfiles/fit.xgb_da_dis.rds')
# fit.xgb_da_dis <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis, 'rfiles/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 500
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_disaster <- c()
pred_xgb_disaster <- c()
r2_test_xgb_disaster <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da_dis$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da_dis$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_disaster <- append(obs_xgb_disaster, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_disaster <- append(pred_xgb_disaster, test_pred)
  r2_test_xgb_disaster <- append(r2_test_xgb_disaster, test_r2)
}
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(obs_xgb_disaster ~
                                                        pred_xgb_disaster))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_disaster ~
                                                        pred_xgb_disaster))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  slice_sample(prop = 0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(year == '2021', month %in% c('10', '11', '12')), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_dis_8020 <- readRDS('rfiles/fit.xgb_da_dis_8020.rds')
# fit.xgb_da_dis_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis_8020, 'rfiles/fit.xgb_da_dis_8020.rds')
getTrainPerf(fit.xgb_da_dis_8020)
# Test statistics
predictions <- predict(fit.xgb_da_dis_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, 
             test %>% pull(H2S_daily_avg))
test_stat
##       RMSE   Rsquared        MAE 
## 23.4073974  0.6423252  6.3878800
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Disaster Only',
                           'R-Sq' = getTrainPerf(fit.xgb_da_dis)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_dis_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_dis_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only XGBoost') +
                        theme_bw()

xgb_dis_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
                        theme_bw()
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_dis_8020$finalModel
## ##### xgb.Booster
## raw: 209.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 31 
## niter: 100
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 205     100         5 0.3  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 20)

XGBoost Full

Daily average model

train <- daily_full[complete.cases(daily_full),]

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_full <- readRDS('rfiles/fit.xgb_da_full.rds')
# fit.xgb_da_full <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full, 'rfiles/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 40 
## niter: 500
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 204     500         5 0.3  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_everything <- c()
pred_xgb_everything <- c()
r2_test_xgb_everything <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da_full$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da_full$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_everything <- append(obs_xgb_everything, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_everything <- append(pred_xgb_everything, test_pred)
  r2_test_xgb_everything <- append(r2_test_xgb_everything, test_r2)
}
xgb_everything_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  'Coef' = summary(lm(obs_xgb_everything ~
                                                        pred_xgb_everything))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_everything ~
                                                        pred_xgb_everything))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- rbind(
  daily_full[complete.cases(daily_full),] %>%
    filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
    slice_sample(prop = 0.8),
  daily_full[complete.cases(daily_full),] %>%
    filter(year == '2021', month %in% c('10', '11', '12')) %>%
    slice_sample(prop = 0.8)
)

test <- anti_join(daily_full[complete.cases(daily_full),], train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_full_8020 <- readRDS('rfiles/fit.xgb_da_full_8020.rds')
# fit.xgb_da_full_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full_8020, 'rfiles/fit.xgb_da_full_8020.rds')
getTrainPerf(fit.xgb_da_full_8020)
# Test statistics
predictions <- predict(fit.xgb_da_full_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, 
             test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 2.4223982 0.2317491 0.4381407
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w.o Disaster Indicator',
                           'R-Sq' = getTrainPerf(fit.xgb_da_full)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_full_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator XGBoost') +
                        theme_bw()

xgb_full_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
                        theme_bw()
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_full_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 40 
## niter: 500
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 81     500         5 0.1 0.001              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 20)

XGBoost Result

xgb_result

Observed Vs. Predicted Plots 10-fold CV

ggarrange(xgb_sincefeb2022_obs_vs_pred_plot, 
          ggarrange(xgb_disaster_obs_vs_pred_plot, xgb_disaster_obs_vs_pred_plot_zoom,  
                    ncol = 2, labels = c("2", "3")),
          ggarrange(xgb_everything_obs_vs_pred_plot, xgb_everything_obs_vs_pred_plot_zoom, 
                    ncol = 2, labels = c("4", "5")), 
                    labels = c("1"),
                    nrow = 3)

knitr::kable(validation_result, digits = 3)
Model Coef R-Sq
Since Feb 2022 1.026 0.989
Disaster Only 0.675 0.666
Everything w.o Disaster Indicator 1.000 1.000

Observed Vs. Predicted Plots 10-fold CV + 80/20 split

ggarrange(xgb_sincefeb2022_obs_vs_pred_plot_8020, 
          ggarrange(xgb_dis_obs_vs_pred_plot_8020, xgb_dis_obs_vs_pred_plot_zoom_8020,  
                    ncol = 2, labels = c("2", "3")),
          ggarrange(xgb_full_obs_vs_pred_plot_8020, xgb_full_obs_vs_pred_plot_zoom_8020, 
                    ncol = 2, labels = c("4", "5")), 
                    labels = c("1"),
                    nrow = 3)

knitr::kable(validation_result_8020)
Model Coef R-Sq
Since Feb 2022 1.0204725 0.7685410
Disaster Only 0.7062446 0.6423252
Everything 0.9326755 0.2317491

Extreme Value Models

EVGAM

# for this part, I will fit from Jan-2021 to April-2022
ev_data <- daily_full %>%
  filter(day >= '2021-01-01',  day < '2022-05-01') %>%
  mutate(month = as.numeric(month),
         weekday = as.character(weekday),
         day = as.numeric(day),
         dist_wrp = I(1/dist_wrp^2),
         dist_dc = I(1/dist_dc^2),
         mon_utm_x = I(mon_utm_x/10^3), 
         mon_utm_y = I(mon_utm_y/10^3))
daily_max_gev <- list(H2S_daily_max ~ s(month,bs='cc') + year, 
                 ~ s(month,bs='cc') + year +
                   weekday + wd_avg + ws_avg + daily_downwind_ref + 
                   capacity + dist_wrp + 
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +  
                   te(mon_utm_x, mon_utm_y, day, 
                      k=c(10,10),d=c(2,1),bs=c('tp','cc')) + 
                   monthly_oil_1km + monthly_gas_1km + active_1km + 
                   daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                   dist_dc + avg_temp + avg_hum + precip, 
                 ~ 1)

dm_gev <- readRDS('rfiles/dm_gev.rds')
# dm_gev <- evgam(daily_max_gev, ev_data, family = "gev")
# saveRDS(dm_gev, 'rfiles/dm_gev.rds')
summary(dm_gev)
## 
## ** Parametric terms **
## 
## location
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     1.39       0.02   57.56   <2e-16
## year2022        0.14       0.06    2.39  0.00846
## 
## logscale
##                     Estimate Std. Error       t value Pr(>|t|)
## (Intercept)            -6.61       1.78 -3.720000e+00 9.93e-05
## year2022                0.49       0.14  3.560000e+00 0.000184
## weekdayMon              0.05       0.05  9.800000e-01    0.162
## weekdaySat             -0.01       0.05 -1.900000e-01    0.424
## weekdaySun              0.00       0.05 -4.000000e-02    0.484
## weekdayThu              0.11       0.05  2.170000e+00    0.015
## weekdayTue              0.08       0.05  1.630000e+00   0.0516
## weekdayWed              0.03       0.05  5.100000e-01    0.303
## wd_avg                  0.00       0.00  2.300000e+00   0.0108
## ws_avg                  0.01       0.01  6.000000e-01    0.273
## daily_downwind_ref      0.07       0.05  1.400000e+00   0.0813
## capacity                0.00       0.00 -2.990000e+00   0.0014
## dist_wrp            45424.75       0.00  2.423312e+10   <2e-16
## monthly_oil_1km         0.00       0.00  7.040000e+00 9.96e-13
## monthly_gas_1km         0.00       0.00 -1.058000e+01   <2e-16
## active_1km              0.03       0.03  1.050000e+00    0.148
## daily_downwind_wrp     -0.03       0.06 -5.200000e-01    0.303
## elevation              -0.03       0.01 -3.160000e+00 0.000798
## EVI                    -0.46       0.15 -3.010000e+00  0.00131
## num_odor_complaints     0.05       0.01  8.660000e+00   <2e-16
## dist_dc              3098.92       0.00  2.100542e+07   <2e-16
## avg_temp                0.01       0.00  1.300000e+00   0.0962
## avg_hum                 0.00       0.00 -1.930000e+00   0.0267
## precip                 -0.06       0.08 -7.400000e-01     0.23
## 
## shape
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     0.21       0.02   11.44   <2e-16
## 
## ** Smooth terms **
## 
## location
##          edf max.df Chi.sq Pr(>|t|)
## s(month) 6.2      8 375.69   <2e-16
## 
## logscale
##                               edf max.df Chi.sq Pr(>|t|)
## s(month)                     7.80      8 356.16   <2e-16
## s(mon_utm_x,mon_utm_y)       5.52      9  27.63 4.12e-05
## te(mon_utm_x,mon_utm_y,day) 54.02     80 290.64   <2e-16
plot(dm_gev)

MGCV

m1 <- readRDS('rfiles/m1.rds')
# m1 <- gam(list(H2S_daily_max ~ s(month,bs='cc') + year,
#                ~ s(month,bs='cc') + year +
#                    weekday + wd_avg + ws_avg + daily_downwind_ref +
#                    capacity + dist_wrp +
#                    s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
#                    te(mon_utm_x, mon_utm_y, day,
#                       k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                    monthly_oil_1km + monthly_gas_1km + active_1km +
#                    daily_downwind_wrp + elevation + EVI + num_odor_complaints +
#                    dist_dc + avg_temp + avg_hum + precip,
#                ~ 1),
#           data = ev_data, method = "REML",
#           family = gevlss(link = list("identity", "identity", "identity")))
# saveRDS(m1, 'rfiles/m1.rds')
summary(m1)
## 
## Family: gevlss 
## Link function: identity identity identity 
## 
## Formula:
## H2S_daily_max ~ s(month, bs = "cc") + year
## ~s(month, bs = "cc") + year + weekday + wd_avg + ws_avg + daily_downwind_ref + 
##     capacity + dist_wrp + s(mon_utm_x, mon_utm_y, bs = "tp", 
##     k = 10) + te(mon_utm_x, mon_utm_y, day, k = c(10, 10), d = c(2, 
##     1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     dist_dc + avg_temp + avg_hum + precip
## ~1
## 
## Parametric coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.392e+00  2.397e-02  58.102  < 2e-16 ***
## year2022               1.361e-01  5.650e-02   2.408 0.016032 *  
## (Intercept).1         -6.973e+00  1.758e+00  -3.966 7.30e-05 ***
## year2022.1             4.965e-01  1.393e-01   3.565 0.000364 ***
## weekdayMon.1           5.013e-02  5.205e-02   0.963 0.335492    
## weekdaySat.1          -9.562e-03  4.864e-02  -0.197 0.844165    
## weekdaySun.1          -3.141e-03  4.897e-02  -0.064 0.948862    
## weekdayThu.1           1.112e-01  5.116e-02   2.173 0.029786 *  
## weekdayTue.1           8.063e-02  5.002e-02   1.612 0.106935    
## weekdayWed.1           2.570e-02  5.172e-02   0.497 0.619250    
## wd_avg.1               4.441e-04  1.908e-04   2.328 0.019930 *  
## ws_avg.1               5.815e-03  1.002e-02   0.580 0.561639    
## daily_downwind_ref.1   6.469e-02  4.793e-02   1.350 0.177101    
## capacity.1            -1.406e-03  5.051e-04  -2.785 0.005359 ** 
## dist_wrp.1            -3.640e+05  1.663e+06  -0.219 0.826789    
## monthly_oil_1km.1      8.399e-04  1.148e-04   7.313 2.62e-13 ***
## monthly_gas_1km.1     -2.707e-03  2.570e-04 -10.533  < 2e-16 ***
## active_1km.1           2.400e-02  2.542e-02   0.944 0.345128    
## daily_downwind_wrp.1  -3.218e-02  6.327e-02  -0.509 0.611053    
## elevation.1           -3.246e-02  1.071e-02  -3.031 0.002436 ** 
## EVI.1                 -4.050e-01  4.274e-01  -0.947 0.343434    
## num_odor_complaints.1  5.175e-02  5.954e-03   8.692  < 2e-16 ***
## dist_dc.1              3.003e+03  5.827e+02   5.154 2.55e-07 ***
## avg_temp.1             6.181e-03  4.631e-03   1.335 0.182010    
## avg_hum.1             -2.800e-03  1.439e-03  -1.946 0.051672 .  
## precip.1              -5.810e-02  7.946e-02  -0.731 0.464715    
## (Intercept).2          2.061e-01  1.778e-02  11.589  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                  edf Ref.df Chi.sq p-value    
## s(month)                       5.946  8.000 531.90 < 2e-16 ***
## s.1(month)                     7.743  8.000 372.15 < 2e-16 ***
## s.1(mon_utm_x,mon_utm_y)       4.972  5.554  22.42 0.00041 ***
## te.1(mon_utm_x,mon_utm_y,day) 53.079 80.000 380.71 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Deviance explained =   NA%
## -REML = 7658.9  Scale est. = 1         n = 3904
plot(m1, pages = 1, scheme = 1, scale = 0, seWithMean = TRUE)